Kembali ke Pengantar Sains Data
Versi file .R
dari modul ini bisa diunduh: Modul 2 (REV).R
Visualisasi Tambahan
STEM AND LEAF PLOT
Digunakan untuk melihat persebaran data (nilai minimum, maksimum, dan bentuk distribusi), mirip seperti histogram.
# Open the help page for stem().
?stem
View(ChickWeight) # dataset that ships with R
# Print a stem-and-leaf display of the weight column.
stem(ChickWeight$weight)
The decimal point is 1 digit(s) to the right of the |
2 | 599999999
4 | 00000111111111111111111112222222222222223333456678888888899999999999+38
6 | 00111111122222222333334444455555666677777888888900111111222222333334+8
8 | 00112223344444455555566777788999990001223333566666788888889
10 | 0000111122233333334566667778889901122223445555667789
12 | 00002223333344445555667788890113444555566788889
14 | 11123444455556666677788890011234444555666777777789
16 | 00002233334444466788990000134445555789
18 | 12244444555677782225677778889999
20 | 0123444555557900245578
22 | 0012357701123344556788
24 | 08001699
26 | 12344569259
28 | 01780145
30 | 355798
32 | 12712
34 | 1
36 | 13
hist(ChickWeight$weight) # histogram of the same column, for comparison
min(ChickWeight$weight) # check the minimum value
max(ChickWeight$weight) # check the maximum value
Kalau ternyata min sama maxnya kurang tepat, ganti scalenya
# Same display with scale = 5; the output that follows splits the data into more, narrower stems.
stem(ChickWeight$weight, scale = 5)
The decimal point is 1 digit(s) to the right of the |
3 | 599999999
4 | 000001111111111111111111122222222222222233334
4 | 5667888888889999999999999
5 | 00000011111111222233333444
5 | 5555566667778888899999
6 | 001111111222222223333344444
6 | 555556666777778888889
7 | 001111112222223333344444444
7 | 6667778889999
8 | 001122233444444
8 | 5555556677778899999
9 | 0001223333
9 | 566666788888889
10 | 0000111122233333334
10 | 5666677788899
11 | 0112222344
11 | 5555667789
12 | 0000222333334444
12 | 555566778889
13 | 0113444
13 | 555566788889
14 | 111234444
14 | 5555666667778889
15 | 0011234444
15 | 555666777777789
16 | 000022333344444
16 | 6678899
17 | 000013444
17 | 5555789
18 | 12244444
18 | 55567778
19 | 222
19 | 5677778889999
20 | 0123444
20 | 5555579
21 | 0024
21 | 5578
22 | 00123
22 | 577
23 | 01123344
23 | 556788
24 | 0
24 | 8
25 | 001
25 | 699
26 | 12344
26 | 569
27 | 2
27 | 59
28 | 01
28 | 78
29 | 014
29 | 5
30 | 3
30 | 5579
31 |
31 | 8
32 | 12
32 | 7
33 | 12
33 |
34 | 1
34 |
35 |
35 |
36 | 1
36 |
37 | 3
# Same display with width = 100; the output that follows prints longer leaf rows before truncating them.
stem(ChickWeight$weight, width = 100)
The decimal point is 1 digit(s) to the right of the |
2 | 599999999
4 | 0000011111111111111111111222222222222222333345667888888889999999999999000000111111112222+18
6 | 0011111112222222233333444445555566667777788888890011111122222233333444444446667778889999
8 | 00112223344444455555566777788999990001223333566666788888889
10 | 0000111122233333334566667778889901122223445555667789
12 | 00002223333344445555667788890113444555566788889
14 | 11123444455556666677788890011234444555666777777789
16 | 00002233334444466788990000134445555789
18 | 12244444555677782225677778889999
20 | 0123444555557900245578
22 | 0012357701123344556788
24 | 08001699
26 | 12344569259
28 | 01780145
30 | 355798
32 | 12712
34 | 1
36 | 13
DOT DIAGRAM
Paling Sederhana
Tambah Judul dan Label
# Dot chart of mpg with one labelled row per car, plus a title and x-axis label.
dotchart(
  x = mtcars$mpg,
  labels = rownames(mtcars),
  main = "Persebaran Jarak Yang Dapat Ditempuh Per Galon",
  xlab = "mpg",
  cex = 0.9
)
Berdasarkan Grup
# Dot chart of mpg grouped by number of cylinders, one colour per group.
grps <- as.factor(mtcars$cyl)
my_cols <- c("blue", "darkgreen", "orange")
# Legend labels are taken from the factor levels so they always match the
# groups that are drawn. The previous hard-coded labels listed "6" twice and
# left out the 8-cylinder group.
cyl_labels <- levels(grps)
dotchart(mtcars$mpg, labels = row.names(mtcars),
         groups = grps, gcolor = my_cols,
         color = my_cols[grps], # index colours by group so each point matches its group
         cex = 0.9, pch = 22, xlab = "mpg",
         main = "Persebaran Jarak Yang Dapat Ditempuh Per Galon Berdasarkan Jumlah Silinder")
legend("bottomright", legend = cyl_labels,
       fill = my_cols, cex = 0.8)
Distribusi Diskrit
plot pdf, cdf, dan data yang dibangkitkan dari distribusi tersebut
Distribusi Diskrit: domainnya (dalam hal ini ruang sampel) hanya bisa diskrit
pdf -> Pr(X=x)
cdf -> Pr(X <= x)
Attaching package: 'Rlab'
The following objects are masked from 'package:stats':
dexp, dgamma, dweibull, pexp, pgamma, pweibull, qexp, qgamma,
qweibull, rexp, rgamma, rweibull
The following object is masked from 'package:datasets':
precip
Bernoulli
PDF
# Evaluate the Bernoulli probability function at a single point.
# NOTE(review): dbern() is presumably provided by the Rlab package named in the attach message earlier in this document - confirm it is loaded first.
dbern(0, prob = 0.3) # 0 is the point of the domain being evaluated, i.e. this computes f(0)
# the prob parameter is the probability of success, i.e. f(1)
plot pdf
# The two points of the Bernoulli domain: 0 and 1.
x <- seq(0, 1, by = 1)
# Only the probabilities are passed to plot(); no x values are supplied.
plot(dbern(x, prob = 0.6))
visualisasi masih jelek,
coba perbaiki plotnya (terutama perhatikan sumbu x-nya): yang tampil di sumbu x bukan nilai domain, melainkan hanya indeks
# Bernoulli(0.6) probabilities plotted against the domain values themselves,
# with an explicit title, axis labels and a fixed 0..1 y range.
bern_pdf <- dbern(x, prob = 0.6)
plot(x, bern_pdf,
     pch = 20, cex = 2,
     ylim = c(0, 1),
     xlab = "x", ylab = "f(x)",
     main = "PDF Distribusi Bernoulli dengan p = 0.6")
# Same Bernoulli(0.6) plot, but with the points joined by a solid line
# (type = "o" overplots points and lines).
bern_pdf <- dbern(x, prob = 0.6)
plot(x, bern_pdf,
     type = "o",
     pch = 20, cex = 2,
     ylim = c(0, 1),
     xlab = "x", ylab = "f(x)",
     main = "PDF Distribusi Bernoulli dengan p = 0.6")
hindari plot seperti ini dalam distribusi diskrit, kenapa?
balik lagi, domainnya diskrit jadi harusnya ga terdefinisi untuk 0 < x < 1 sehingga harusnya tidak boleh dihubungkan oleh garis lurus
kalo pake garis putus2 masih oke lah, tapi kurang recommend untuk distribusi diskrit:
# Bernoulli(0.6) plot with the points joined by a dashed line (lty = 2).
# The title previously said p = 0.4 although the plotted probabilities use
# prob = 0.6; the title now matches the distribution that is drawn.
plot(x,
     dbern(x, prob = 0.6),
     main = "PDF Distribusi Bernoulli dengan p = 0.6",
     xlab = "x",
     ylab = "f(x)",
     ylim = c(0, 1),
     pch = 20,
     cex = 2,
     type = "o",
     lty = 2)
# Evaluate the Bernoulli(0.6) probabilities on a wider integer grid so the
# plot also shows points outside {0, 1}.
# x2 was used without ever being defined in this script. It is defined here
# as 0..10 to match the xaxp = c(0, 10, 10) axis specification used with it.
x2 <- seq(0, 10, by = 1)
plot(x2,
     dbern(x2, prob = 0.6),
     pch = 20,
     cex = 2,
     xaxp = c(0,10,10))
apa kesimpulannya? untuk x = 2,3,4,… f(x) = 0
CDF
# Plot the Bernoulli(0.6) CDF evaluated at every point of x2.
# x2 must already exist when this runs; xaxp = c(0, 10, 10) asks for x-axis ticks from 0 to 10 in 10 intervals.
plot(x2,
pbern(x2, prob = 0.6),
pch = 20,
cex = 2,
xaxp = c(0,10,10))
apa kesimpulannya? untuk x = 1,2,3,... F(x) = 1 else F(x) = 1-p
Random
bangkitkan n data dari distribusi bernoulli
# Draw n Bernoulli(0.6) variates with a fixed seed, then print the relative
# frequency of each outcome.
n <- 100
set.seed(122)
random_bern <- rbern(n, prob = 0.6)
prop.table(table(random_bern))
random_bern
0 1
0.42 0.58
# Bar chart of the outcome counts, with each bar annotated by its count.
bern_counts <- table(random_bern)
random_bern_plot <- barplot(bern_counts, ylim = c(0, 100))
# barplot() returns the bar midpoints, used here as the x positions of the labels.
text(random_bern_plot, bern_counts, labels = bern_counts, pos = 3)
Distribusi Binomial
PDF
# Binomial probability at a single point.
dbinom(2,
size = 10,
prob = 0.6) # this is Pr(X = 2) where X ~ binom(10, 0.6)
# Plot the binomial probabilities at every point of x2.
# NOTE(review): this plot uses prob = 0.4 while the point evaluation above uses 0.6 - confirm which value is intended.
plot(x2,
dbinom(x2,10,0.4),
xaxp = c(0,10,10),
pch = 20,
ylim = c(0, 0.3)) # plot the pdf
CDF
# Binomial CDF at a single point: Pr(X <= 2) for size = 10, prob = 0.6.
pbinom(2, size = 10, prob = 0.6)
# Plot the binomial CDF at every point of x2.
# NOTE(review): this plot uses prob = 0.4 while the point evaluation above uses 0.6 - confirm which value is intended.
plot(x2,
pbinom(x2,10,0.4),
xaxp = c(0,10,10),
pch = 20,
ylim = c(0, 1))
Random
bangkitkan n data dari distribusi binomial
# Draw n Binomial(10, 0.6) variates with a fixed seed, then print the
# relative frequency of each observed value.
n <- 100
set.seed(122)
random_binom <- rbinom(n, size = 10, prob = 0.6)
prop.table(table(random_binom))
random_binom
3 4 5 6 7 8 9 10
0.05 0.14 0.17 0.27 0.21 0.08 0.06 0.02
# Bar chart of the observed value counts, with each bar annotated by its count.
binom_counts <- table(random_binom)
random_binom_plot <- barplot(binom_counts, ylim = c(0, 40))
# barplot() returns the bar midpoints, used here as the x positions of the labels.
text(random_binom_plot, binom_counts, labels = binom_counts, pos = 3)
Distribusi lain
untuk distribusi lain, intinya tetap sama hanya sesuaikan parameternya saja
format:
pdf -> d+nama distribusi()
misal pdf poisson berarti dpois()
cdf poisson: ppois()
data random dari distribusi poisson rpois()
selengkapnya bisa cek di dokumentasi